from IPython.display import Image
Valores missing, outliers y correlaciones
En este notebook se realiza el estudio y preprocesamiento de las variables numéricas y categóricas. Se realizarán los siguientes pasos:
1. Cambio de tipos de variables
2. Separación en train y test
3. Análisis de cada variable con gráficos descriptivos
4. Para variables numéricas: correlaciones de Pearson, estudio de outliers y estudio de valores missing
5. Para variables categóricas: relleno de valores missing y estudio de correlaciones con la V de Cramér
Para los valores outlier por columnas, se tendrá en cuenta los gráficos:
Dentro del tratamiento de los valores missing, se elegirá alguno de los siguientes métodos:
Importo librerías
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings
# Raise pandas' display limits so dataframes are shown with up to 500 columns
# and 5000 rows.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
Voy guardando las funciones que están automatizadas y que pienso que me van a servir en otros proyectos en un fichero funciones_auxiliares.py, y lo importo:
#import funciones_auxiliares as f_aux
# Si importas las funciones del .py de esta forma, para llamar a una función habría que hacer: f_aux.plot_feature(...)
# y eliminar la siguiente celda de funciones, ya que ya las estaríamos importando desde funciones_auxiliares.py
def plot_feature(df, col_name, isContinuous, target):
    """
    Visualize a variable on its own and broken down by the target.

    Draws a 1x2 figure. The left panel shows the distribution of ``col_name``
    (histogram of its non-null values when ``isContinuous`` is true, count
    plot otherwise) and reports the number of nulls in the title. The right
    panel relates ``col_name`` to ``target``: a box plot when continuous,
    otherwise grouped bars with the proportion of each target value within
    every category of ``col_name``.

    - df: dataframe containing both ``col_name`` and ``target``
    - col_name: name of the column to visualize
    - isContinuous: True if the variable is continuous, False otherwise
    - target: name of the target column used for the breakdown

    Returns None. The function does not call ``plt.show()`` itself.
    """
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)

    # Left panel: distribution of the variable.
    count_null = df[col_name].isnull().sum()
    if isContinuous:
        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(df, x=col_name, color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name+ ' Numero de nulos: '+str(count_null))
    # NOTE(review): plt.xticks acts on pyplot's *current* axes, which is not
    # necessarily ax1 here -- confirm which panel is meant to be rotated.
    plt.xticks(rotation = 90)

    # Right panel: relation between the variable and the target.
    if isContinuous:
        sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by '+target)
    else:
        data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
        # Use the function's own parameter for the first column name; the
        # previous code read a global loop variable `i` from the caller.
        data.columns = [col_name, target, 'proportion']
        sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        ax2.set_ylabel(target+' fraction')
        ax2.set_title(target)
        plt.xticks(rotation = 90)
    ax2.set_xlabel(col_name)

    plt.tight_layout()
def dame_variables_categoricas(dataset=None):
    '''
    Split the non-float, non-int columns of a dataset by their cardinality.

    Inputs:
    -- dataset: pandas dataframe holding the data
    Return:
    -- (lista_variables_categoricas, other): two lists of column names whose
       dtype is neither float nor int; the first has the columns with fewer
       than 100 distinct non-null values, the second the remaining ones
    -- 1: when no dataset is passed
    '''
    if dataset is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1

    low_cardinality = []
    high_cardinality = []
    for name in dataset.columns:
        column = dataset[name]
        # Float and int columns are left out of both lists.
        if column.dtype == float or column.dtype == int:
            continue
        n_distinct = int(len(np.unique(column.dropna(axis=0, how='all'))))
        bucket = low_cardinality if n_distinct < 100 else high_cardinality
        bucket.append(name)

    return low_cardinality, high_cardinality
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
    """
    Show a heatmap of the pairwise correlations between dataframe columns.

    - dataset: dataframe whose columns are correlated
    - metodo: correlation method handed to ``DataFrame.corr`` (for Spearman
      correlation pass 'spearman')
    - size_figure: figure size handed to ``plt.subplots``

    Returns 0 after showing the figure, or 1 when no dataset is passed.
    """
    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1

    sns.set(style="white")

    corr = dataset.corr(method=metodo)

    # Blank out the diagonal: self-correlation would only distract.
    for pos, _ in enumerate(corr.index):
        corr.iloc[pos, pos] = 0

    # The new figure becomes the current one, so the heatmap is drawn on it.
    f, ax = plt.subplots(figsize=size_figure)
    sns.heatmap(
        corr,
        center=0,
        square=True,
        linewidths=.5,
        cmap ='viridis',
    )
    plt.show()

    return 0
def get_deviation_of_mean_perc(pd_fraud, list_var_continuous, target, multiplier):
    """
    Summarize, per variable, the values outside mean +/- multiplier * std.

    For every column in ``list_var_continuous`` with at least one value
    outside that interval, the result has one row containing:
    - one column per target class, with the normalized distribution of
      ``target`` among the rows whose value is outside the interval,
    - 'variable': the column name,
    - 'sum_outlier_values': how many values are outside the interval,
    - 'porcentaje_sum_null_values': that count divided by the number of rows
      (the column name is kept as-is for backward compatibility).

    :param pd_fraud: dataframe containing the variables and the target
    :param list_var_continuous: names of the columns to inspect
    :param target: name of the target column
    :param multiplier: number of standard deviations that define the interval
    :return: dataframe with one row per affected variable; empty (and a
        message is printed) when no variable has values outside the interval
    """
    rows = []
    for col in list_var_continuous:
        series = pd_fraud[col]
        center = series.mean()
        half_width = multiplier * series.std()
        # NaN compares False on both sides, so missing values are not counted.
        outside = (series < center - half_width) | (series > center + half_width)
        n_outside = int(outside.sum())
        if n_outside == 0:
            continue

        # Build the row from the value counts directly so that any number of
        # target classes (including a single one) is supported.
        target_share = pd_fraud.loc[outside, target].value_counts(normalize=True)
        row = dict(target_share.items())
        row['variable'] = col
        row['sum_outlier_values'] = n_outside
        row['porcentaje_sum_null_values'] = n_outside / series.size
        rows.append(row)

    pd_final = pd.DataFrame(rows)
    if pd_final.empty:
        print('No existen variables con valores outlier')
    return pd_final
def get_percent_null_values_target(pd_fraud, list_var_continuous, target):
    """
    Summarize, per variable, how its null values relate to the target.

    For every column in ``list_var_continuous`` with at least one null, the
    result has one row containing:
    - one column per target class, with the normalized distribution of
      ``target`` among the rows where the variable is null,
    - 'variable': the column name,
    - 'sum_null_values': number of nulls in the column,
    - 'porcentaje_sum_null_values': nulls divided by the number of rows.

    :param pd_fraud: dataframe containing the variables and the target
    :param list_var_continuous: names of the columns to inspect
    :param target: name of the target column
    :return: dataframe with one row per variable that has nulls; empty (and a
        message is printed) when none of the variables has nulls
    """
    n_rows = pd_fraud.shape[0]
    rows = []
    for col in list_var_continuous:
        is_null = pd_fraud[col].isnull()
        n_null = int(is_null.sum())
        if n_null == 0:
            continue

        # Build the row from the value counts directly so that any number of
        # target classes (including a single one) is supported.
        target_share = pd_fraud.loc[is_null, target].value_counts(normalize=True)
        row = dict(target_share.items())
        row['variable'] = col
        row['sum_null_values'] = n_null
        row['porcentaje_sum_null_values'] = n_null / n_rows
        rows.append(row)

    pd_final = pd.DataFrame(rows)
    if pd_final.empty:
        print('No existen variables con valores nulos')
    return pd_final
def cramers_v(confusion_matrix):
    """
    Calculate Cramér's V statistic for categorical-categorical association.

    Uses the bias correction from Wicher Bergsma,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    confusion_matrix: contingency table of counts, either the dataframe
        produced by pd.crosstab() or its underlying 2-D array.

    Returns the bias-corrected V; 0.0 when the corrected table has fewer than
    two effective rows or columns, since no association can be measured.
    """
    # Work on a plain array: DataFrame.sum() would give per-column sums
    # instead of the grand total needed below.
    observed = np.asarray(confusion_matrix)
    chi2 = ss.chi2_contingency(observed)[0]
    n = observed.sum()
    phi2 = chi2 / n
    r, k = observed.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        # Degenerate table (e.g. a single row or column): avoid dividing by 0.
        return 0.0
    return np.sqrt(phi2corr / denominator)
Lectura de datos del preprocesado inicial
Lectura de los datos y cambio de tipos de variables
# Load the base dataset from the local data folder and display its
# (rows, columns) shape.
pd_fraud = pd.read_csv("data/Base.csv")
pd_fraud.shape
(1000000, 32)
# Display the column names of the loaded dataframe.
pd_fraud.columns
Index(['fraud_bool', 'income', 'name_email_similarity',
'prev_address_months_count', 'current_address_months_count',
'customer_age', 'days_since_request', 'intended_balcon_amount',
'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
'velocity_4w', 'bank_branch_count_8w',
'date_of_birth_distinct_emails_4w', 'employment_status',
'credit_risk_score', 'email_is_free', 'housing_status',
'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
'session_length_in_minutes', 'device_os', 'keep_alive_session',
'device_distinct_emails_8w', 'device_fraud_count', 'month'],
dtype='object')
Observamos que dentro de nuestro dataset hay tanto variables categóricas (tal y como se ha visto en el notebook 01), variables float (continuas) y variables int (números enteros)
# Columns handled as categorical (the target 'fraud_bool' included).
lista_variables_categoricas = [
    "payment_type",
    "employment_status",
    "housing_status",
    "email_is_free",
    "phone_home_valid",
    "phone_mobile_valid",
    "has_other_cards",
    "foreign_request",
    "source",
    "device_os",
    "keep_alive_session",
    "fraud_bool",
]

# Cast all of them to pandas' 'category' dtype in a single call.
pd_fraud = pd_fraud.astype(dict.fromkeys(lista_variables_categoricas, 'category'))

lista_variables_categoricas
['payment_type', 'employment_status', 'housing_status', 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 'source', 'device_os', 'keep_alive_session', 'fraud_bool']
A continuación transformamos 5 de las variables a formato decimal.
# Cast these five columns to float.
for columna in (
    'prev_address_months_count',
    'current_address_months_count',
    'zip_count_4w',
    'bank_branch_count_8w',
    'credit_risk_score',
):
    pd_fraud[columna] = pd_fraud[columna].astype(float)
Separación en train y test estratificado
import plotly.express as px
# Percentage of rows per target value.
pd_plot_fraud_bool = (
    pd_fraud['fraud_bool']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

# Absolute number of rows per target value.
pd_plot_fraud_bool_conteo = pd_fraud['fraud_bool'].value_counts().reset_index()

# Put both summaries side by side, matched on the target value.
pd_plot_fraud_bool_pc = pd_plot_fraud_bool.merge(
    pd_plot_fraud_bool_conteo,
    on=['fraud_bool'],
    how='inner',
)

fig = px.histogram(
    pd_plot_fraud_bool_pc,
    x='fraud_bool',
    y='percent',
    labels={'fraud_bool': 'Fraud Status', 'percent': 'Percentage'},
)
fig.show()
De nuevo, obtenemos el histograma del notebook 01
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for test. The split is stratified on the target so
# both partitions keep the same class proportions, and the seed is fixed so
# that re-running the notebook yields the same partitions.
X_pd_fraud, X_pd_fraud_test, y_pd_fraud, y_pd_fraud_test = train_test_split(
    pd_fraud.drop('fraud_bool', axis=1),
    pd_fraud['fraud_bool'],
    stratify=pd_fraud['fraud_bool'],
    test_size=0.2,
    random_state=42,
)

# Re-attach the target as the last column of each partition.
pd_fraud_train = pd.concat([X_pd_fraud, y_pd_fraud], axis=1)
pd_fraud_test = pd.concat([X_pd_fraud_test, y_pd_fraud_test], axis=1)
C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\sklearn\utils\validation.py:605: FutureWarning: is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\sklearn\utils\validation.py:614: FutureWarning: is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
# Print the class proportions of the target in each partition.
for etiqueta, particion in (('== Train\n', pd_fraud_train), ('== Test\n', pd_fraud_test)):
    print(etiqueta, particion['fraud_bool'].value_counts(normalize=True))
== Train fraud_bool 0 0.988971 1 0.011029 Name: proportion, dtype: float64 == Test fraud_bool 0 0.98897 1 0.01103 Name: proportion, dtype: float64
Se observa la división entre train y test para la variable objetivo de nuestro dataset. Los porcentajes de cada clase son prácticamente iguales en train y en test, aunque esto no quiere decir que ambos conjuntos tengan el mismo número de registros (se muestran proporciones porque normalize=True). La separación se hace estratificada para que, al estar la variable objetivo desbalanceada, train y test conserven la misma proporción de clases.
Visualización descriptiva de los datos
Distribución de las variables
# Display the column names of the training partition.
pd_fraud_train.columns
Index(['income', 'name_email_similarity', 'prev_address_months_count',
'current_address_months_count', 'customer_age', 'days_since_request',
'intended_balcon_amount', 'payment_type', 'zip_count_4w', 'velocity_6h',
'velocity_24h', 'velocity_4w', 'bank_branch_count_8w',
'date_of_birth_distinct_emails_4w', 'employment_status',
'credit_risk_score', 'email_is_free', 'housing_status',
'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
'session_length_in_minutes', 'device_os', 'keep_alive_session',
'device_distinct_emails_8w', 'device_fraud_count', 'month',
'fraud_bool'],
dtype='object')
Creamos dos variables que recogen por un lado, las variables categóricas del dataframe (lista_variables_categoricas), y por otro lado, las variables numéricas tanto enteras como float como binarias (lista_variables_numericas)
target = 'fraud_bool'

# Categorical columns (the target included).
lista_variables_categoricas = [
    "payment_type",
    "employment_status",
    "housing_status",
    "email_is_free",
    "phone_home_valid",
    "phone_mobile_valid",
    "has_other_cards",
    "foreign_request",
    "source",
    "device_os",
    "keep_alive_session",
    "fraud_bool",
]

# Columns handled as numeric.
lista_variables_numericas = [
    "income",
    "name_email_similarity",
    "prev_address_months_count",
    "current_address_months_count",
    "customer_age",
    "days_since_request",
    "intended_balcon_amount",
    "zip_count_4w",
    "velocity_6h",
    "velocity_24h",
    "velocity_4w",
    "bank_branch_count_8w",
    "date_of_birth_distinct_emails_4w",
    "credit_risk_score",
    "bank_months_count",
    "proposed_credit_limit",
    "session_length_in_minutes",
    "device_distinct_emails_8w",
    "device_fraud_count",
    "month",
]

# Store the target of the training partition as strings.
pd_fraud_train[target] = pd_fraud_train[target].astype(str)
La siguiente celda del código se utiliza para realizar visualizaciones de las características en el DataFrame pd_fraud_train. Este enfoque facilita obtener una comprensión rápida de cómo diferentes características se relacionan con la variable objetivo.
%%time
# Plot every column except the target; float columns are drawn as continuous,
# all others as discrete.
# NOTE(review): the loop variable is kept as `i` on purpose -- plot_feature,
# as written in this notebook, may read a global with that name; confirm
# before renaming it.
for i in pd_fraud_train.columns:
    if i == 'fraud_bool':
        continue
    es_continua = pd_fraud_train[i].dtype == float
    plot_feature(pd_fraud_train, col_name=i, isContinuous=es_continua, target='fraud_bool')
C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. 
C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. 
C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. 
C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:9: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. 
C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\AppData\Local\Temp\ipykernel_42004\2343862388.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\sarim\anaconda3\envs\practicaEDA\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
CPU times: total: 21.7 s Wall time: 2min 46s
Comentarios de los gráficos¶
Comentarios de los gráficos más relevantes:
En el primer gráfico boxplot podemos ver que hay una diferencia clara entre las medianas de 'income' cuando comparamos los datos fraudulentos con los no fraudulentos, siendo mucho mayor en los casos en los que se comete fraude. Esto puede llevar a pensar que las cuentas con mayor 'income' tienden a ser fraudulentas. Un estudio más profundo sobre esta relación podría ser interesante.
Como podemos ver en el gráfico de barras de 'name_email_similarity', la mayoría de la gente pone su nombre en la dirección del email.
En el boxplot no hay diferencias significativas por lo que probablemente no tenga relación el nombre del email con la posibilidad de que las aplicaciones sean fraudulentas o no.
Observamos que los dos bigotes del boxplot de la variable 'prev_address_months_count' indican que al menos el 50% de las aplicaciones fraudulentas presentan esta variable como valor faltante. Esta variable podría ser relevante para detectar los fraudes.
El gráfico de barras de la variable 'customer_age' muestra un dato muy relevante: que se utilizan cuentas de personas más mayores para generar aplicaciones fraudulentas.
En el histograma de 'intended_balcon_amount' podemos ver que casi nunca se introduce dinero al iniciar una aplicación. Eso sumado a que los gráficos boxplots son muy parecidos, indica que esta variable no sería muy útil para el estudio.
En el gráfico de barras de 'date_of_birth_distinct_emails_4w' observamos que hay 4 semanas dignas de mención (0,1,2,36). Un estudio más a fondo podría ser interesante.
A pesar de que en el histograma de 'employment_status' se observa que la categoría que más se repite es CA y que los grupos CC y, en especial, CG apenas tienen aplicaciones, en el gráfico de barras observamos que los employment_status CC y CG tienden a ser utilizados para crear aplicaciones fraudulentas.
A pesar de que en el histograma de 'housing_status' se observa que la categoría que más se repite es BC, en el gráfico de barras observamos que el housing_status BA tiende a ser más utilizado para crear aplicaciones fraudulentas.
En el gráfico de barras de 'bank_months_count' observamos que los usuarios crean más cuentas fraudulentas cuando la cuenta bancaria de un cliente tiene 17 meses.
En el boxplot de 'proposed_credit_limit' se muestra que las aplicaciones fraudulentas se tienden a crear con un límite de cuenta mayor, lo que tiene lógica puesto que se quiere obtener confianza.
En el histograma de 'foreign_request' podemos ver que no hay muchas cuentas con esta variable igual a 1, y en el gráfico de barras entonces podemos observar una ligera diferencia. Cuentas con este valor se pueden llegar a utilizar más para crear aplicaciones fraudulentas.
Gráficos de 'keep_alive_session': las cuentas con aplicaciones fraudulentas tienden a dejar la sesión abierta, lo cual tiene sentido porque así no se requiere introducir contraseñas.
Las aplicaciones fraudulentas es probable que se creen desde el mismo dispositivo, y esto puede explicar que en el gráfico de barras de 'device_distinct_emails_8w' el valor que más se repita sea el 2.
No se aprecian diferencias significativas en los gráficos boxplot ni información relevante en los histogramas de las siguientes variables: 'payment_type', 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'source', 'device_os', 'current_address_months_count', 'days_since_request', 'zip_count_4w', 'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w', 'credit_risk_score', 'session_length_in_minutes', 'device_fraud_count' y 'month'.
- Tratamiento de las variables numéricas¶
A continuación, se tratan los valores missing, las correlaciones de las variables numéricas y los outliers.
lista_variables_numericas
['income', 'name_email_similarity', 'prev_address_months_count', 'current_address_months_count', 'customer_age', 'days_since_request', 'intended_balcon_amount', 'zip_count_4w', 'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'bank_months_count', 'proposed_credit_limit', 'session_length_in_minutes', 'device_distinct_emails_8w', 'device_fraud_count', 'month']
Tratamiento de outliers¶
Los valores outlier se pueden sustituir por la media, la mediana o valores extremos (media+3std o media-3std). Tras el siguiente análisis, hemos decidido, como primera iteración, dejarlos sin sustituir. Una vez lleguemos al modelo, podremos realizar iteraciones utilizando diferentes métodos para comprobar si mejora el modelo.
pd_fraud_train
| income | name_email_similarity | prev_address_months_count | current_address_months_count | customer_age | days_since_request | intended_balcon_amount | payment_type | zip_count_4w | velocity_6h | velocity_24h | velocity_4w | bank_branch_count_8w | date_of_birth_distinct_emails_4w | employment_status | credit_risk_score | email_is_free | housing_status | phone_home_valid | phone_mobile_valid | bank_months_count | has_other_cards | proposed_credit_limit | foreign_request | source | session_length_in_minutes | device_os | keep_alive_session | device_distinct_emails_8w | device_fraud_count | month | fraud_bool | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 844931 | 0.2 | 0.141930 | -1.0 | 40.0 | 50 | 0.024422 | -1.412505 | AB | 2504.0 | 8838.670678 | 5048.335872 | 4263.532921 | 4.0 | 7 | CA | 125.0 | 1 | BC | 0 | 1 | 20 | 1 | 200.0 | 0 | INTERNET | 12.867456 | other | 1 | 1 | 0 | 5 | 0 |
| 754426 | 0.3 | 0.346878 | 31.0 | 11.0 | 30 | 0.011844 | 52.591384 | AA | 738.0 | 2572.523408 | 7816.818383 | 4218.950926 | 11.0 | 9 | CA | 82.0 | 1 | BC | 0 | 1 | 11 | 0 | 200.0 | 0 | INTERNET | 5.815533 | linux | 1 | 1 | 0 | 5 | 0 |
| 715567 | 0.1 | 0.843095 | -1.0 | 206.0 | 50 | 0.774946 | 49.445788 | AA | 4790.0 | 7852.149227 | 4281.170377 | 6771.263190 | 1354.0 | 4 | CB | 67.0 | 1 | BC | 0 | 1 | 25 | 0 | 200.0 | 0 | INTERNET | 17.412355 | other | 1 | 1 | 0 | 0 | 0 |
| 850530 | 0.1 | 0.152189 | 25.0 | 1.0 | 20 | 68.075714 | -0.598303 | AC | 717.0 | 4710.489115 | 5830.846361 | 4379.146843 | 1.0 | 11 | CA | 88.0 | 1 | BE | 0 | 1 | -1 | 0 | 200.0 | 0 | INTERNET | 10.980653 | linux | 1 | 1 | 0 | 5 | 0 |
| 916672 | 0.4 | 0.998324 | -1.0 | 35.0 | 50 | 0.004613 | -1.547372 | AB | 1077.0 | 6930.311123 | 5181.562318 | 4176.094149 | 22.0 | 2 | CB | 90.0 | 0 | BC | 0 | 1 | 3 | 1 | 200.0 | 0 | INTERNET | 14.104151 | other | 0 | 1 | 0 | 4 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 792889 | 0.1 | 0.730265 | 54.0 | 16.0 | 20 | 0.018541 | -1.055121 | AC | 566.0 | 6503.467682 | 3032.637862 | 4257.734586 | 1904.0 | 9 | CA | 177.0 | 0 | BC | 0 | 1 | -1 | 0 | 500.0 | 0 | INTERNET | 12.025350 | windows | 1 | 1 | 0 | 5 | 0 |
| 769034 | 0.2 | 0.429537 | 27.0 | 13.0 | 60 | 1.175316 | -1.259155 | AC | 849.0 | 2704.249089 | 4915.168946 | 4313.915365 | 0.0 | 6 | CC | 143.0 | 1 | BC | 1 | 1 | -1 | 0 | 200.0 | 0 | INTERNET | 18.759694 | linux | 0 | 1 | 0 | 5 | 0 |
| 405102 | 0.1 | 0.244073 | -1.0 | 283.0 | 40 | 3.850229 | -0.497876 | AC | 1645.0 | 3688.615016 | 4846.934917 | 5431.954408 | 1.0 | 6 | CA | 193.0 | 1 | BA | 0 | 1 | -1 | 0 | 200.0 | 0 | INTERNET | 4.291687 | macintosh | 0 | 1 | 0 | 1 | 0 |
| 529726 | 0.8 | 0.510365 | -1.0 | 149.0 | 20 | 0.003480 | -0.489399 | AD | 1957.0 | 1791.236767 | 2841.291183 | 4275.514597 | 15.0 | 15 | CA | 184.0 | 1 | BB | 0 | 1 | 10 | 0 | 1500.0 | 0 | INTERNET | 8.485615 | windows | 0 | 1 | 0 | 6 | 0 |
| 816585 | 0.9 | 0.232059 | -1.0 | 65.0 | 30 | 0.002175 | -0.675794 | AB | 849.0 | 809.496910 | 2628.615638 | 4179.454984 | 16.0 | 11 | CA | 98.0 | 1 | BC | 0 | 1 | 1 | 0 | 200.0 | 0 | INTERNET | 1.871505 | windows | 1 | 1 | 0 | 5 | 0 |
800000 rows × 32 columns
get_deviation_of_mean_perc(pd_fraud_train, lista_variables_numericas, target='fraud_bool', multiplier=3)
| 0 | 1 | variable | sum_outlier_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.993412 | 0.006588 | prev_address_months_count | 20189 | 0.025236 |
| 1 | 0.983869 | 0.016131 | current_address_months_count | 17172 | 0.021465 |
| 2 | 0.956831 | 0.043169 | customer_age | 6324 | 0.007905 |
| 3 | 0.987552 | 0.012448 | days_since_request | 14300 | 0.017875 |
| 4 | 0.990352 | 0.009648 | intended_balcon_amount | 15133 | 0.018916 |
| 5 | 0.990161 | 0.009839 | zip_count_4w | 13010 | 0.016262 |
| 6 | 0.992777 | 0.007223 | velocity_6h | 3461 | 0.004326 |
| 7 | 0.997701 | 0.002299 | velocity_24h | 435 | 0.000544 |
| 8 | 0.989898 | 0.010102 | bank_branch_count_8w | 32767 | 0.040959 |
| 9 | 0.993182 | 0.006818 | date_of_birth_distinct_emails_4w | 4987 | 0.006234 |
| 10 | 0.965901 | 0.034099 | credit_risk_score | 2962 | 0.003703 |
| 11 | 0.869654 | 0.130346 | proposed_credit_limit | 4910 | 0.006137 |
| 12 | 0.980115 | 0.019885 | session_length_in_minutes | 18858 | 0.023572 |
| 13 | 0.963353 | 0.036647 | device_distinct_emails_8w | 25514 | 0.031892 |
La función anterior calcula la desviación de la media para cada variable numerica con respecto a la variable objetivo (fraud_bool) y, en función del multiplicador, identifica los outliers. Este enfoque es común en la detección de outliers para resaltar observaciones que tienen un gran impacto en el comportamiento de la variable objetivo con las variables numéricas.
Se puede observar que en la variable proposed_credit_limit los outliers tienen un mayor porcentaje (0.130346, es decir, un 13,03%) de fraud_bool=1 (solicitud fraudulenta) que en el resto de variables.
Correlaciones¶
get_corr_matrix(dataset = pd_fraud_train[lista_variables_numericas],
metodo='pearson', size_figure=[10,8])
0
- Lo más llamativo del gráfico superior es que los pares "velocity_24h" – "velocity_4w" (alrededor de un 54%) y 'credit_risk_score' – 'proposed_credit_limit' (alrededor de un 61%) se encuentran entre los que poseen un mayor grado de correlación positiva entre sí.
- Además, podemos destacar que, en valor absoluto, las variables más correlacionadas de la matriz superior son 'month' y 'velocity_4w', con un 84,83% de correlación.
# Rank every pair of numeric variables by the absolute value of its Pearson
# correlation, computed on the training set.
corr = pd_fraud_train[lista_variables_numericas].corr('pearson')
# Work with magnitudes so strong negative correlations rank as high as positive ones.
new_corr = corr.abs()
# np.tril with k=-1 keeps only the entries strictly below the main diagonal and
# sets the diagonal and the upper triangle to 0, so each pair is counted once.
# The zeroed entries are not removed: they remain as rows with correlation 0
# in the long table built below.
new_corr.loc[:,:] = np.tril(new_corr, k=-1) # keep only the strictly-lower triangle
# Reshape the matrix to long format (level_0, level_1, correlation) and sort
# from the most to the least correlated pair.
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
# Display only the pairs whose absolute correlation exceeds 0.5.
new_corr[new_corr['correlation']>0.5]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 372 | month | velocity_4w | 0.848313 |
| 313 | proposed_credit_limit | credit_risk_score | 0.605689 |
| 371 | month | velocity_24h | 0.549924 |
| 209 | velocity_4w | velocity_24h | 0.538996 |
Comprobamos las variables que tienen una correlación mayor a un 50%.
new_corr
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 372 | month | velocity_4w | 0.848313 |
| 313 | proposed_credit_limit | credit_risk_score | 0.605689 |
| 371 | month | velocity_24h | 0.549924 |
| 209 | velocity_4w | velocity_24h | 0.538996 |
| 188 | velocity_24h | velocity_6h | 0.464394 |
| 244 | date_of_birth_distinct_emails_4w | customer_age | 0.420340 |
| 370 | month | velocity_6h | 0.409642 |
| 208 | velocity_4w | velocity_6h | 0.400591 |
| 207 | velocity_4w | zip_count_4w | 0.302428 |
| 369 | month | zip_count_4w | 0.286610 |
| 62 | current_address_months_count | prev_address_months_count | 0.271796 |
| 374 | month | date_of_birth_distinct_emails_4w | 0.244577 |
| 250 | date_of_birth_distinct_emails_4w | velocity_4w | 0.242238 |
| 187 | velocity_24h | zip_count_4w | 0.201044 |
| 286 | bank_months_count | intended_balcon_amount | 0.178036 |
| 375 | month | credit_risk_score | 0.176031 |
| 243 | date_of_birth_distinct_emails_4w | current_address_months_count | 0.174769 |
| 260 | credit_risk_score | income | 0.171589 |
| 270 | credit_risk_score | velocity_4w | 0.168914 |
| 264 | credit_risk_score | customer_age | 0.165410 |
| 249 | date_of_birth_distinct_emails_4w | velocity_24h | 0.155087 |
| 304 | proposed_credit_limit | customer_age | 0.147791 |
| 269 | credit_risk_score | velocity_24h | 0.146625 |
| 268 | credit_risk_score | velocity_6h | 0.145023 |
| 83 | customer_age | current_address_months_count | 0.143012 |
| 167 | velocity_6h | zip_count_4w | 0.142283 |
| 291 | bank_months_count | bank_branch_count_8w | 0.142247 |
| 272 | credit_risk_score | date_of_birth_distinct_emails_4w | 0.136189 |
| 303 | proposed_credit_limit | current_address_months_count | 0.130970 |
| 247 | date_of_birth_distinct_emails_4w | zip_count_4w | 0.126291 |
| 80 | customer_age | income | 0.125631 |
| 248 | date_of_birth_distinct_emails_4w | velocity_6h | 0.118453 |
| 362 | month | income | 0.116409 |
| 300 | proposed_credit_limit | income | 0.109077 |
| 200 | velocity_4w | income | 0.106146 |
| 226 | bank_branch_count_8w | intended_balcon_amount | 0.103173 |
| 267 | credit_risk_score | zip_count_4w | 0.102343 |
| 180 | velocity_24h | income | 0.101141 |
| 160 | velocity_6h | income | 0.096158 |
| 263 | credit_risk_score | current_address_months_count | 0.096093 |
| 265 | credit_risk_score | days_since_request | 0.087627 |
| 140 | zip_count_4w | income | 0.081503 |
| 378 | month | session_length_in_minutes | 0.079189 |
| 102 | days_since_request | prev_address_months_count | 0.078838 |
| 330 | session_length_in_minutes | velocity_4w | 0.078522 |
| 356 | device_distinct_emails_8w | session_length_in_minutes | 0.077812 |
| 301 | proposed_credit_limit | name_email_similarity | 0.072721 |
| 283 | bank_months_count | current_address_months_count | 0.071724 |
| 82 | customer_age | prev_address_months_count | 0.069110 |
| 305 | proposed_credit_limit | days_since_request | 0.068889 |
| 329 | session_length_in_minutes | velocity_24h | 0.066610 |
| 186 | velocity_24h | intended_balcon_amount | 0.064896 |
| 240 | date_of_birth_distinct_emails_4w | income | 0.062788 |
| 293 | bank_months_count | credit_risk_score | 0.061442 |
| 242 | date_of_birth_distinct_emails_4w | prev_address_months_count | 0.059415 |
| 120 | intended_balcon_amount | income | 0.059179 |
| 306 | proposed_credit_limit | intended_balcon_amount | 0.058652 |
| 103 | days_since_request | current_address_months_count | 0.058019 |
| 123 | intended_balcon_amount | current_address_months_count | 0.057536 |
| 377 | month | proposed_credit_limit | 0.055932 |
| 320 | session_length_in_minutes | income | 0.055135 |
| 328 | session_length_in_minutes | velocity_6h | 0.054910 |
| 223 | bank_branch_count_8w | current_address_months_count | 0.054364 |
| 287 | bank_months_count | zip_count_4w | 0.053895 |
| 322 | session_length_in_minutes | prev_address_months_count | 0.052249 |
| 206 | velocity_4w | intended_balcon_amount | 0.051628 |
| 310 | proposed_credit_limit | velocity_4w | 0.051509 |
| 61 | current_address_months_count | name_email_similarity | 0.049945 |
| 325 | session_length_in_minutes | days_since_request | 0.049724 |
| 81 | customer_age | name_email_similarity | 0.049005 |
| 261 | credit_risk_score | name_email_similarity | 0.048905 |
| 224 | bank_branch_count_8w | customer_age | 0.048822 |
| 143 | zip_count_4w | current_address_months_count | 0.047059 |
| 379 | month | device_distinct_emails_8w | 0.046563 |
| 282 | bank_months_count | prev_address_months_count | 0.045022 |
| 312 | proposed_credit_limit | date_of_birth_distinct_emails_4w | 0.044172 |
| 368 | month | intended_balcon_amount | 0.043309 |
| 166 | velocity_6h | intended_balcon_amount | 0.042621 |
| 350 | device_distinct_emails_8w | velocity_4w | 0.042159 |
| 363 | month | name_email_similarity | 0.042064 |
| 353 | device_distinct_emails_8w | credit_risk_score | 0.039984 |
| 222 | bank_branch_count_8w | prev_address_months_count | 0.039963 |
| 201 | velocity_4w | name_email_similarity | 0.039289 |
| 327 | session_length_in_minutes | zip_count_4w | 0.039056 |
| 20 | name_email_similarity | income | 0.038322 |
| 251 | date_of_birth_distinct_emails_4w | bank_branch_count_8w | 0.037826 |
| 60 | current_address_months_count | income | 0.037008 |
| 104 | days_since_request | customer_age | 0.036593 |
| 229 | bank_branch_count_8w | velocity_24h | 0.036085 |
| 349 | device_distinct_emails_8w | velocity_24h | 0.035156 |
| 181 | velocity_24h | name_email_similarity | 0.034921 |
| 333 | session_length_in_minutes | credit_risk_score | 0.033919 |
| 323 | session_length_in_minutes | current_address_months_count | 0.033852 |
| 290 | bank_months_count | velocity_4w | 0.032986 |
| 230 | bank_branch_count_8w | velocity_4w | 0.031086 |
| 348 | device_distinct_emails_8w | velocity_6h | 0.030284 |
| 142 | zip_count_4w | prev_address_months_count | 0.030105 |
| 332 | session_length_in_minutes | date_of_birth_distinct_emails_4w | 0.029953 |
| 121 | intended_balcon_amount | name_email_similarity | 0.029722 |
| 373 | month | bank_branch_count_8w | 0.029136 |
| 302 | proposed_credit_limit | prev_address_months_count | 0.028508 |
| 165 | velocity_6h | days_since_request | 0.028413 |
| 246 | date_of_birth_distinct_emails_4w | intended_balcon_amount | 0.027304 |
| 163 | velocity_6h | current_address_months_count | 0.027157 |
| 285 | bank_months_count | days_since_request | 0.026881 |
| 344 | device_distinct_emails_8w | customer_age | 0.025816 |
| 203 | velocity_4w | current_address_months_count | 0.025462 |
| 324 | session_length_in_minutes | customer_age | 0.025380 |
| 376 | month | bank_months_count | 0.025238 |
| 308 | proposed_credit_limit | velocity_6h | 0.024832 |
| 185 | velocity_24h | days_since_request | 0.024582 |
| 141 | zip_count_4w | name_email_similarity | 0.024267 |
| 161 | velocity_6h | name_email_similarity | 0.023921 |
| 365 | month | current_address_months_count | 0.023767 |
| 314 | proposed_credit_limit | bank_months_count | 0.023519 |
| 241 | date_of_birth_distinct_emails_4w | name_email_similarity | 0.022680 |
| 326 | session_length_in_minutes | intended_balcon_amount | 0.022495 |
| 164 | velocity_6h | customer_age | 0.021376 |
| 125 | intended_balcon_amount | days_since_request | 0.020721 |
| 347 | device_distinct_emails_8w | zip_count_4w | 0.020567 |
| 228 | bank_branch_count_8w | velocity_6h | 0.020324 |
| 289 | bank_months_count | velocity_24h | 0.019575 |
| 346 | device_distinct_emails_8w | intended_balcon_amount | 0.019545 |
| 40 | prev_address_months_count | income | 0.018854 |
| 245 | date_of_birth_distinct_emails_4w | days_since_request | 0.018638 |
| 41 | prev_address_months_count | name_email_similarity | 0.018376 |
| 225 | bank_branch_count_8w | days_since_request | 0.018301 |
| 183 | velocity_24h | current_address_months_count | 0.018197 |
| 205 | velocity_4w | days_since_request | 0.018068 |
| 367 | month | days_since_request | 0.018065 |
| 355 | device_distinct_emails_8w | proposed_credit_limit | 0.017836 |
| 341 | device_distinct_emails_8w | name_email_similarity | 0.017772 |
| 334 | session_length_in_minutes | bank_months_count | 0.017603 |
| 309 | proposed_credit_limit | velocity_24h | 0.016560 |
| 262 | credit_risk_score | prev_address_months_count | 0.016226 |
| 100 | days_since_request | income | 0.015583 |
| 288 | bank_months_count | velocity_6h | 0.014830 |
| 271 | credit_risk_score | bank_branch_count_8w | 0.014621 |
| 144 | zip_count_4w | customer_age | 0.014371 |
| 124 | intended_balcon_amount | customer_age | 0.012997 |
| 284 | bank_months_count | customer_age | 0.012915 |
| 307 | proposed_credit_limit | zip_count_4w | 0.012259 |
| 342 | device_distinct_emails_8w | prev_address_months_count | 0.011195 |
| 184 | velocity_24h | customer_age | 0.010147 |
| 311 | proposed_credit_limit | bank_branch_count_8w | 0.009942 |
| 340 | device_distinct_emails_8w | income | 0.009809 |
| 122 | intended_balcon_amount | prev_address_months_count | 0.009186 |
| 182 | velocity_24h | prev_address_months_count | 0.008872 |
| 352 | device_distinct_emails_8w | date_of_birth_distinct_emails_4w | 0.008845 |
| 220 | bank_branch_count_8w | income | 0.008654 |
| 292 | bank_months_count | date_of_birth_distinct_emails_4w | 0.008329 |
| 345 | device_distinct_emails_8w | days_since_request | 0.007579 |
| 101 | days_since_request | name_email_similarity | 0.006781 |
| 281 | bank_months_count | name_email_similarity | 0.006740 |
| 321 | session_length_in_minutes | name_email_similarity | 0.006329 |
| 366 | month | customer_age | 0.005566 |
| 202 | velocity_4w | prev_address_months_count | 0.005520 |
| 227 | bank_branch_count_8w | zip_count_4w | 0.005323 |
| 351 | device_distinct_emails_8w | bank_branch_count_8w | 0.004985 |
| 331 | session_length_in_minutes | bank_branch_count_8w | 0.004661 |
| 204 | velocity_4w | customer_age | 0.004401 |
| 280 | bank_months_count | income | 0.003953 |
| 354 | device_distinct_emails_8w | bank_months_count | 0.003507 |
| 266 | credit_risk_score | intended_balcon_amount | 0.002284 |
| 162 | velocity_6h | prev_address_months_count | 0.001836 |
| 146 | zip_count_4w | intended_balcon_amount | 0.001818 |
| 221 | bank_branch_count_8w | name_email_similarity | 0.001801 |
| 364 | month | prev_address_months_count | 0.001741 |
| 343 | device_distinct_emails_8w | current_address_months_count | 0.000449 |
| 145 | zip_count_4w | days_since_request | 0.000160 |
| 335 | session_length_in_minutes | proposed_credit_limit | 0.000034 |
| 235 | bank_branch_count_8w | proposed_credit_limit | 0.000000 |
| 234 | bank_branch_count_8w | bank_months_count | 0.000000 |
| 233 | bank_branch_count_8w | credit_risk_score | 0.000000 |
| 236 | bank_branch_count_8w | session_length_in_minutes | 0.000000 |
| 232 | bank_branch_count_8w | date_of_birth_distinct_emails_4w | 0.000000 |
| 231 | bank_branch_count_8w | bank_branch_count_8w | 0.000000 |
| 237 | bank_branch_count_8w | device_distinct_emails_8w | 0.000000 |
| 238 | bank_branch_count_8w | device_fraud_count | 0.000000 |
| 0 | income | income | 0.000000 |
| 239 | bank_branch_count_8w | month | 0.000000 |
| 336 | session_length_in_minutes | session_length_in_minutes | 0.000000 |
| 298 | bank_months_count | device_fraud_count | 0.000000 |
| 299 | bank_months_count | month | 0.000000 |
| 315 | proposed_credit_limit | proposed_credit_limit | 0.000000 |
| 316 | proposed_credit_limit | session_length_in_minutes | 0.000000 |
| 317 | proposed_credit_limit | device_distinct_emails_8w | 0.000000 |
| 318 | proposed_credit_limit | device_fraud_count | 0.000000 |
| 319 | proposed_credit_limit | month | 0.000000 |
| 337 | session_length_in_minutes | device_distinct_emails_8w | 0.000000 |
| 252 | date_of_birth_distinct_emails_4w | date_of_birth_distinct_emails_4w | 0.000000 |
| 338 | session_length_in_minutes | device_fraud_count | 0.000000 |
| 339 | session_length_in_minutes | month | 0.000000 |
| 357 | device_distinct_emails_8w | device_distinct_emails_8w | 0.000000 |
| 358 | device_distinct_emails_8w | device_fraud_count | 0.000000 |
| 359 | device_distinct_emails_8w | month | 0.000000 |
| 360 | device_fraud_count | device_fraud_count | 0.000000 |
| 361 | device_fraud_count | month | 0.000000 |
| 297 | bank_months_count | device_distinct_emails_8w | 0.000000 |
| 296 | bank_months_count | session_length_in_minutes | 0.000000 |
| 295 | bank_months_count | proposed_credit_limit | 0.000000 |
| 294 | bank_months_count | bank_months_count | 0.000000 |
| 253 | date_of_birth_distinct_emails_4w | credit_risk_score | 0.000000 |
| 254 | date_of_birth_distinct_emails_4w | bank_months_count | 0.000000 |
| 255 | date_of_birth_distinct_emails_4w | proposed_credit_limit | 0.000000 |
| 256 | date_of_birth_distinct_emails_4w | session_length_in_minutes | 0.000000 |
| 257 | date_of_birth_distinct_emails_4w | device_distinct_emails_8w | 0.000000 |
| 258 | date_of_birth_distinct_emails_4w | device_fraud_count | 0.000000 |
| 259 | date_of_birth_distinct_emails_4w | month | 0.000000 |
| 273 | credit_risk_score | credit_risk_score | 0.000000 |
| 274 | credit_risk_score | bank_months_count | 0.000000 |
| 275 | credit_risk_score | proposed_credit_limit | 0.000000 |
| 276 | credit_risk_score | session_length_in_minutes | 0.000000 |
| 277 | credit_risk_score | device_distinct_emails_8w | 0.000000 |
| 278 | credit_risk_score | device_fraud_count | 0.000000 |
| 279 | credit_risk_score | month | 0.000000 |
| 218 | velocity_4w | device_fraud_count | 0.000000 |
| 219 | velocity_4w | month | 0.000000 |
| 190 | velocity_24h | velocity_4w | 0.000000 |
| 217 | velocity_4w | device_distinct_emails_8w | 0.000000 |
| 44 | prev_address_months_count | customer_age | 0.000000 |
| 65 | current_address_months_count | days_since_request | 0.000000 |
| 64 | current_address_months_count | customer_age | 0.000000 |
| 63 | current_address_months_count | current_address_months_count | 0.000000 |
| 59 | prev_address_months_count | month | 0.000000 |
| 58 | prev_address_months_count | device_fraud_count | 0.000000 |
| 57 | prev_address_months_count | device_distinct_emails_8w | 0.000000 |
| 56 | prev_address_months_count | session_length_in_minutes | 0.000000 |
| 55 | prev_address_months_count | proposed_credit_limit | 0.000000 |
| 54 | prev_address_months_count | bank_months_count | 0.000000 |
| 53 | prev_address_months_count | credit_risk_score | 0.000000 |
| 52 | prev_address_months_count | date_of_birth_distinct_emails_4w | 0.000000 |
| 51 | prev_address_months_count | bank_branch_count_8w | 0.000000 |
| 50 | prev_address_months_count | velocity_4w | 0.000000 |
| 49 | prev_address_months_count | velocity_24h | 0.000000 |
| 48 | prev_address_months_count | velocity_6h | 0.000000 |
| 47 | prev_address_months_count | zip_count_4w | 0.000000 |
| 46 | prev_address_months_count | intended_balcon_amount | 0.000000 |
| 66 | current_address_months_count | intended_balcon_amount | 0.000000 |
| 67 | current_address_months_count | zip_count_4w | 0.000000 |
| 68 | current_address_months_count | velocity_6h | 0.000000 |
| 78 | current_address_months_count | device_fraud_count | 0.000000 |
| 89 | customer_age | velocity_24h | 0.000000 |
| 88 | customer_age | velocity_6h | 0.000000 |
| 87 | customer_age | zip_count_4w | 0.000000 |
| 86 | customer_age | intended_balcon_amount | 0.000000 |
| 85 | customer_age | days_since_request | 0.000000 |
| 84 | customer_age | customer_age | 0.000000 |
| 79 | current_address_months_count | month | 0.000000 |
| 77 | current_address_months_count | device_distinct_emails_8w | 0.000000 |
| 69 | current_address_months_count | velocity_24h | 0.000000 |
| 76 | current_address_months_count | session_length_in_minutes | 0.000000 |
| 75 | current_address_months_count | proposed_credit_limit | 0.000000 |
| 74 | current_address_months_count | bank_months_count | 0.000000 |
| 73 | current_address_months_count | credit_risk_score | 0.000000 |
| 72 | current_address_months_count | date_of_birth_distinct_emails_4w | 0.000000 |
| 71 | current_address_months_count | bank_branch_count_8w | 0.000000 |
| 70 | current_address_months_count | velocity_4w | 0.000000 |
| 45 | prev_address_months_count | days_since_request | 0.000000 |
| 43 | prev_address_months_count | current_address_months_count | 0.000000 |
| 216 | velocity_4w | session_length_in_minutes | 0.000000 |
| 42 | prev_address_months_count | prev_address_months_count | 0.000000 |
| 18 | income | device_fraud_count | 0.000000 |
| 17 | income | device_distinct_emails_8w | 0.000000 |
| 16 | income | session_length_in_minutes | 0.000000 |
| 15 | income | proposed_credit_limit | 0.000000 |
| 14 | income | bank_months_count | 0.000000 |
| 13 | income | credit_risk_score | 0.000000 |
| 12 | income | date_of_birth_distinct_emails_4w | 0.000000 |
| 11 | income | bank_branch_count_8w | 0.000000 |
| 10 | income | velocity_4w | 0.000000 |
| 9 | income | velocity_24h | 0.000000 |
| 8 | income | velocity_6h | 0.000000 |
| 7 | income | zip_count_4w | 0.000000 |
| 6 | income | intended_balcon_amount | 0.000000 |
| 5 | income | days_since_request | 0.000000 |
| 4 | income | customer_age | 0.000000 |
| 3 | income | current_address_months_count | 0.000000 |
| 2 | income | prev_address_months_count | 0.000000 |
| 19 | income | month | 0.000000 |
| 21 | name_email_similarity | name_email_similarity | 0.000000 |
| 22 | name_email_similarity | prev_address_months_count | 0.000000 |
| 32 | name_email_similarity | date_of_birth_distinct_emails_4w | 0.000000 |
| 39 | name_email_similarity | month | 0.000000 |
| 38 | name_email_similarity | device_fraud_count | 0.000000 |
| 37 | name_email_similarity | device_distinct_emails_8w | 0.000000 |
| 36 | name_email_similarity | session_length_in_minutes | 0.000000 |
| 35 | name_email_similarity | proposed_credit_limit | 0.000000 |
| 34 | name_email_similarity | bank_months_count | 0.000000 |
| 33 | name_email_similarity | credit_risk_score | 0.000000 |
| 31 | name_email_similarity | bank_branch_count_8w | 0.000000 |
| 23 | name_email_similarity | current_address_months_count | 0.000000 |
| 30 | name_email_similarity | velocity_4w | 0.000000 |
| 29 | name_email_similarity | velocity_24h | 0.000000 |
| 28 | name_email_similarity | velocity_6h | 0.000000 |
| 27 | name_email_similarity | zip_count_4w | 0.000000 |
| 26 | name_email_similarity | intended_balcon_amount | 0.000000 |
| 25 | name_email_similarity | days_since_request | 0.000000 |
| 24 | name_email_similarity | customer_age | 0.000000 |
| 90 | customer_age | velocity_4w | 0.000000 |
| 91 | customer_age | bank_branch_count_8w | 0.000000 |
| 92 | customer_age | date_of_birth_distinct_emails_4w | 0.000000 |
| 93 | customer_age | credit_risk_score | 0.000000 |
| 176 | velocity_6h | session_length_in_minutes | 0.000000 |
| 175 | velocity_6h | proposed_credit_limit | 0.000000 |
| 174 | velocity_6h | bank_months_count | 0.000000 |
| 173 | velocity_6h | credit_risk_score | 0.000000 |
| 172 | velocity_6h | date_of_birth_distinct_emails_4w | 0.000000 |
| 171 | velocity_6h | bank_branch_count_8w | 0.000000 |
| 170 | velocity_6h | velocity_4w | 0.000000 |
| 169 | velocity_6h | velocity_24h | 0.000000 |
| 168 | velocity_6h | velocity_6h | 0.000000 |
| 159 | zip_count_4w | month | 0.000000 |
| 158 | zip_count_4w | device_fraud_count | 0.000000 |
| 157 | zip_count_4w | device_distinct_emails_8w | 0.000000 |
| 156 | zip_count_4w | session_length_in_minutes | 0.000000 |
| 155 | zip_count_4w | proposed_credit_limit | 0.000000 |
| 154 | zip_count_4w | bank_months_count | 0.000000 |
| 153 | zip_count_4w | credit_risk_score | 0.000000 |
| 152 | zip_count_4w | date_of_birth_distinct_emails_4w | 0.000000 |
| 177 | velocity_6h | device_distinct_emails_8w | 0.000000 |
| 178 | velocity_6h | device_fraud_count | 0.000000 |
| 179 | velocity_6h | month | 0.000000 |
| 198 | velocity_24h | device_fraud_count | 0.000000 |
| 215 | velocity_4w | proposed_credit_limit | 0.000000 |
| 214 | velocity_4w | bank_months_count | 0.000000 |
| 213 | velocity_4w | credit_risk_score | 0.000000 |
| 212 | velocity_4w | date_of_birth_distinct_emails_4w | 0.000000 |
| 211 | velocity_4w | bank_branch_count_8w | 0.000000 |
| 210 | velocity_4w | velocity_4w | 0.000000 |
| 199 | velocity_24h | month | 0.000000 |
| 197 | velocity_24h | device_distinct_emails_8w | 0.000000 |
| 189 | velocity_24h | velocity_24h | 0.000000 |
| 196 | velocity_24h | session_length_in_minutes | 0.000000 |
| 195 | velocity_24h | proposed_credit_limit | 0.000000 |
| 194 | velocity_24h | bank_months_count | 0.000000 |
| 193 | velocity_24h | credit_risk_score | 0.000000 |
| 192 | velocity_24h | date_of_birth_distinct_emails_4w | 0.000000 |
| 191 | velocity_24h | bank_branch_count_8w | 0.000000 |
| 1 | income | name_email_similarity | 0.000000 |
| 151 | zip_count_4w | bank_branch_count_8w | 0.000000 |
| 150 | zip_count_4w | velocity_4w | 0.000000 |
| 149 | zip_count_4w | velocity_24h | 0.000000 |
| 107 | days_since_request | zip_count_4w | 0.000000 |
| 114 | days_since_request | bank_months_count | 0.000000 |
| 113 | days_since_request | credit_risk_score | 0.000000 |
| 112 | days_since_request | date_of_birth_distinct_emails_4w | 0.000000 |
| 111 | days_since_request | bank_branch_count_8w | 0.000000 |
| 110 | days_since_request | velocity_4w | 0.000000 |
| 109 | days_since_request | velocity_24h | 0.000000 |
| 108 | days_since_request | velocity_6h | 0.000000 |
| 106 | days_since_request | intended_balcon_amount | 0.000000 |
| 116 | days_since_request | session_length_in_minutes | 0.000000 |
| 105 | days_since_request | days_since_request | 0.000000 |
| 99 | customer_age | month | 0.000000 |
| 98 | customer_age | device_fraud_count | 0.000000 |
| 97 | customer_age | device_distinct_emails_8w | 0.000000 |
| 96 | customer_age | session_length_in_minutes | 0.000000 |
| 95 | customer_age | proposed_credit_limit | 0.000000 |
| 94 | customer_age | bank_months_count | 0.000000 |
| 115 | days_since_request | proposed_credit_limit | 0.000000 |
| 117 | days_since_request | device_distinct_emails_8w | 0.000000 |
| 148 | zip_count_4w | velocity_6h | 0.000000 |
| 133 | intended_balcon_amount | credit_risk_score | 0.000000 |
| 147 | zip_count_4w | zip_count_4w | 0.000000 |
| 139 | intended_balcon_amount | month | 0.000000 |
| 138 | intended_balcon_amount | device_fraud_count | 0.000000 |
| 137 | intended_balcon_amount | device_distinct_emails_8w | 0.000000 |
| 136 | intended_balcon_amount | session_length_in_minutes | 0.000000 |
| 135 | intended_balcon_amount | proposed_credit_limit | 0.000000 |
| 134 | intended_balcon_amount | bank_months_count | 0.000000 |
| 132 | intended_balcon_amount | date_of_birth_distinct_emails_4w | 0.000000 |
| 118 | days_since_request | device_fraud_count | 0.000000 |
| 131 | intended_balcon_amount | bank_branch_count_8w | 0.000000 |
| 130 | intended_balcon_amount | velocity_4w | 0.000000 |
| 129 | intended_balcon_amount | velocity_24h | 0.000000 |
| 128 | intended_balcon_amount | velocity_6h | 0.000000 |
| 127 | intended_balcon_amount | zip_count_4w | 0.000000 |
| 126 | intended_balcon_amount | intended_balcon_amount | 0.000000 |
| 119 | days_since_request | month | 0.000000 |
| 380 | month | month | 0.000000 |
Decidimos, en esta primera iteración, no eliminar variables correlacionadas. Sin embargo, si finalmente aplicamos algún algoritmo que lo requiera, tendremos que eliminarlas.
Tratamiento de valores nulos¶
lista_variables_numericas
['income', 'name_email_similarity', 'prev_address_months_count', 'current_address_months_count', 'customer_age', 'days_since_request', 'intended_balcon_amount', 'zip_count_4w', 'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'bank_months_count', 'proposed_credit_limit', 'session_length_in_minutes', 'device_distinct_emails_8w', 'device_fraud_count', 'month']
get_percent_null_values_target(pd_fraud_train, lista_variables_numericas, target='fraud_bool')
No existen variables con valores nulos
Como ya hemos adelantado en el primer notebook, los missings en este dataset no están codificados como NAs, sino como -1 o como valores negativos. Por esta razón, el código muestra el mensaje 'No existen variables con valores nulos', a pesar de que todas las variables que contienen missings forman parte de 'lista_variables_numericas'.
- Tratamiento de las variables categoricas¶
lista_variables_categoricas
['payment_type', 'employment_status', 'housing_status', 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 'source', 'device_os', 'keep_alive_session', 'fraud_bool']
Calculamos el coeficiente de Cramer para ver la relación entre una variable categórica y la variable objetivo. Escogemos dos variables aleatorias.
# Cross-tabulate the target (rows) against payment_type (columns). Despite the
# variable name, this is a contingency table of counts, not a classifier
# confusion matrix.
confusion_matrix = pd.crosstab(pd_fraud_train["fraud_bool"], pd_fraud_train["payment_type"])
# Show the raw counts per (fraud_bool, payment_type) combination.
print(confusion_matrix)
# Cramér's V computed from the table of counts; cramers_v is defined elsewhere
# in this notebook.
cramers_v(confusion_matrix.values)
payment_type AA AB AC AD AE fraud_bool 0 205500 293306 198169 93981 221 1 1097 3305 3389 1031 1
0.03929832455278903
Coeficiente de Cramér prácticamente nulo: apenas existe relación entre 'payment_type' y 'fraud_bool'.
# Sanity check: cross-tabulate the target against itself, so that cramers_v is
# evaluated on a variable compared with itself.
confusion_matrix = pd.crosstab(pd_fraud_train["fraud_bool"], pd_fraud_train["fraud_bool"])
cramers_v(confusion_matrix.values)
0.9999426978916621
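El resultado obtenido en esta celda tiene sentido, puesto que la asociación de una variable consigo misma es de 1. El valor obtenido (0.99994) no es exactamente 1, presumiblemente por la corrección que aplica la función cramers_v en el cálculo (pendiente de confirmar en su implementación).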
# NOTE(review): date_of_birth_distinct_emails_4w is listed in
# lista_variables_numericas, not in lista_variables_categoricas. pd.crosstab
# treats each distinct value of the column as its own category here.
confusion_matrix = pd.crosstab(pd_fraud_train["fraud_bool"], pd_fraud_train["date_of_birth_distinct_emails_4w"])
cramers_v(confusion_matrix.values)
0.06243118840944539
# Cramér's V between the target and customer_age.
# This column is listed among the numeric variables; here every distinct value
# becomes one column of the contingency table.
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["customer_age"],
)
cramers_v(confusion_matrix.values)
0.06708666095660588
# Cramér's V between the target (rows) and employment_status (columns).
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["employment_status"],
)
cramers_v(confusion_matrix.values)
0.03928772388675474
# Cramér's V between the target (rows) and email_is_free (columns).
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["email_is_free"],
)
cramers_v(confusion_matrix.values)
0.028257098698935404
# Cramér's V between the target (rows) and housing_status (columns).
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["housing_status"],
)
cramers_v(confusion_matrix.values)
0.11509684153964615
# Cramér's V between the target (rows) and phone_home_valid (columns).
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["phone_home_valid"],
)
cramers_v(confusion_matrix.values)
0.03555196601878687
# Cramér's V between the target (rows) and phone_mobile_valid (columns).
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["phone_mobile_valid"],
)
cramers_v(confusion_matrix.values)
0.01245751012983334
# Cramér's V between the target and bank_months_count.
# This column is listed among the numeric variables; here every distinct value
# becomes one column of the contingency table.
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["bank_months_count"],
)
cramers_v(confusion_matrix.values)
0.05026673622693013
# Cramér's V between the target (rows) and has_other_cards (columns).
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["has_other_cards"],
)
cramers_v(confusion_matrix.values)
0.03555204314818798
# Cramér's V between the target (rows) and foreign_request (columns).
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["foreign_request"],
)
cramers_v(confusion_matrix.values)
0.016327020576381513
# Cramér's V between the target (rows) and source (columns).
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["source"],
)
cramers_v(confusion_matrix.values)
0.004179406004488106
# Cramér's V between the target (rows) and device_os (columns).
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["device_os"],
)
cramers_v(confusion_matrix.values)
0.08006341026817546
# Cramér's V between the target (rows) and keep_alive_session (columns).
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["keep_alive_session"],
)
cramers_v(confusion_matrix.values)
0.05006946268902914
# Cramér's V between the target and device_distinct_emails_8w.
# This column is listed among the numeric variables; here every distinct value
# becomes one column of the contingency table.
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["device_distinct_emails_8w"],
)
cramers_v(confusion_matrix.values)
0.04594566555405527
# Cramér's V between the target and month.
# This column is listed among the numeric variables; here every distinct value
# becomes one column of the contingency table.
confusion_matrix = pd.crosstab(
    index=pd_fraud_train["fraud_bool"],
    columns=pd_fraud_train["month"],
)
cramers_v(confusion_matrix.values)
0.01800914259548991
Se observa que las relaciones con la variable objetivo son prácticamente inexistentes según el coeficiente de Cramer. Podemos destacar que, aunque muy baja, la relación con la variable 'housing_status' es la mayor con respecto a la variable objetivo (0,115).
CONCLUSIONES FINALES¶
En relación con los gráficos obtenidos se alcanzan distintas conclusiones:
Existe una diferencia clara entre las medias de 'income' cuando medimos los datos fraudulentos contra los no fraudulentos, siendo mucho mayor en los casos en los que se comete fraude. Esto puede llevar a pensar que las cuentas con más income tienden a ser fraudulentas. Un estudio más profundo sobre esta relación podría ser interesante.
Como podemos ver en el gráfico de barras de 'name_email_similarity', la mayoría de la gente pone su nombre en la dirección del email.
Probablemente no tenga relación el nombre del email con la posibilidad de que las aplicaciones sean fraudulentas o no.
Al menos el 50% de las aplicaciones fraudulentas presentan 'prev_address_months_count' como valor faltante (variable que podría ser relevante para detectar los fraudes)
El gráfico de barras de la variable 'customer_age' muestra un dato muy relevante: que se utilizan cuentas de personas más mayores para generar aplicaciones fraudulentas.
En el histograma de 'intended_balcon_amount' podemos ver que casi nunca se introduce dinero al iniciar una aplicación. Eso sumado a que los gráficos boxplots son muy parecidos, indica que esta variable no sería muy útil para el estudio.
En el gráfico de barras de 'date_of_birth_distinct_emails_4w' observamos que hay 4 valores dignos de mención (0, 1, 2 y 36). Un estudio más a fondo podría ser interesante.
Aunque en el histograma de 'employment_status' se observa que la categoría más frecuente es CA y que los grupos CC y, en especial, CG apenas tienen aplicaciones, en el gráfico de barras observamos que los employment_status CC y CG tienden a ser utilizados para crear aplicaciones fraudulentas.
Aunque en el histograma de 'housing_status' se observa que la categoría más frecuente es BC, en el gráfico de barras observamos que el housing_status BA tiende a ser más utilizado para crear aplicaciones fraudulentas.
En el gráfico de barras de 'bank_months_count' observamos que los usuarios crean más cuentas fraudulentas cuando la cuenta bancaria de un cliente tiene 17 meses.
En el boxplot de 'proposed_credit_limit' se muestra que las aplicaciones fraudulentas se tienden a crear con un límite de cuenta mayor, lo que tiene lógica puesto que se quiere obtener confianza.
En el histograma de 'foreign_request' podemos ver que no hay muchas cuentas con esta variable igual a 1, y en el gráfico de barras entonces podemos observar una ligera diferencia. Cuentas con este valor se pueden llegar a utilizar más para crear aplicaciones fraudulentas.
Gráficos keep_alive_session: Las cuentas con aplicaciones fraudulentas tienden a dejarse la cuenta abierta lo cual tiene sentido porque así no se requiere introducir contraseñas.
Las aplicaciones fraudulentas es probable que se creen desde el mismo dispositivo, y esto puede explicar que en el gráfico de barras de 'device_distinct_emails_8w' el valor que más se repita sea el 2.
El resto de variables no contienen información relevante o destacable dentro de sus boxplot e histogramas.
Hemos creado dos listas: una con las variables categóricas del dataset y otra con las variables numéricas.
Hemos realizado la división del dataset en train y test. Al comparar la distribución de la variable objetivo en ambos conjuntos (con normalize=True), vemos que los porcentajes de cada clase son los mismos en train y en test, lo cual no quiere decir que ambos tengan el mismo número de registros.
Tratamos los outliers calculando, para cada variable numérica, la desviación respecto a la media en función de un multiplicador, y analizando su distribución con respecto a la variable objetivo (fraud_bool). Se puede observar que en la variable 'proposed_credit_limit' los outliers tienen un mayor porcentaje (0,1296) de fraud_bool=1 (solicitud fraudulenta).
De la matriz de correlaciones concluimos que, en valor absoluto, las variables más correlacionadas de la matriz superior son 'month' y 'velocity_4w' con un 84,80% de correlación. Además, las velocidades "velocity_24h" y "velocity_4w" y las variables 'credit_risk_score' y 'proposed_credit_limit' también poseen un mayor grado de correlación positiva entre sí (alrededor de un 60%).
Por último, con el coeficiente de Cramer podemos observar que las relaciones de las variables con la variable objetivo son prácticamente inexistentes. Podemos destacar que, aunque muy baja, la relación con la variable 'housing_status', es la mayor con respecto a la variable objetivo (0,115).
# Write the train and test sets to CSV files under data/.
# NOTE(review): to_csv is called without index=False, so the index is written as an
# extra unnamed first column; confirm that whatever reads these files expects it
# (e.g. reads them with index_col=0).
pd_fraud_train.to_csv("data/train_pd_data_preprocessing_missing_outlier.csv")
pd_fraud_test.to_csv("data/test_pd_data_preprocessing_missing_outlier.csv")